#
# File for processing and preparing raw IPUMS data for use in analysis.R
# Article: "Group Ties amid Industrial Change" (accepted at World Politics)
# Author: Noah Zucker (noah.zucker@columbia.edu)
#
# Software: R version 3.6.2 / RStudio version 1.2.5033 / macOS 12.1
# Hardware: MacBook Pro (16-inch, 2019) / Processor: 2.4 GHz 8-Core Intel Core i9 / RAM: 64 GB 2667 MHz DDR4
#
# Last edited May 8, 2022
#

# ******************* #
##################### #
#### SESSION SETUP ####
##################### #
# ******************* #

rm(list = ls()); gc()

set.seed(1)

#install.packages("pacman")
pacman::p_load(tidyverse,
               tidylog,
               AER,
               broom,
               data.table,
               DeclareDesign,
               ggpubr,
               ggthemes,
               ggmap,
               interflex,
               janitor,
               lfe,
               lmtest,
               magrittr,
               patchwork,
               pryr,
               RColorBrewer,
               readxl,
               rgdal,
               rgeos,
               sf,
               stargazer,
               texreg,
               xtable)

# > Function for determining immigrant ethnicity ####

ethnicity <- function(bpl = NA, bpld = NA, mbpl = NA, fbpl = NA, race = NA, hispan = NA, mtongue = NA, mtongued = NA){
  
  GROUP <- case_when(bpl == 400 ~ "danish",
                     bpl == 401 ~ "finnish",
                     bpl == 402 ~ "icelandic",
                     bpl == 404 ~ "norwegian",
                     bpl == 405 ~ "swedish",
                     bpl == 410 ~ "english",
                     bpl == 411 ~ "scottish",
                     bpl == 412 ~ "welsh",
                     bpl == 414 ~ "irish",
                     bpl == 420 ~ "belgian",
                     bpl == 421 ~ "french",
                     bpl == 423 ~ "luxembourgish",
                     bpl == 425 ~ "dutch",
                     bpl == 426 ~ "swiss",
                     bpl == 433 ~ "greek",
                     bpl == 434 ~ "italian",
                     bpl == 436 ~ "portuguese",
                     bpl == 438 ~ "spanish",
                     bpl == 450 ~ "austrian",
                     bpl == 451 ~ "bulgarian",
                     bpl == 452 ~ "czechoslovak",
                     bpl == 453 ~ "german",
                     bpl == 454 ~ "hungarian",
                     bpl == 455 ~ "polish",
                     bpl == 456 ~ "rumanian",
                     bpld == 45710 ~ "croatian",
                     bpld == 45730 ~ "serbian",
                     bpld == 45780 ~ "slovenian",
                     bpld == 46000 ~ "estonian",
                     bpld == 46100 ~ "latvian",
                     bpld == 46200 ~ "lithuanian",
                     bpld %in% c(46500:46521, 46548) ~ "russian",
                     bpld == 46530 ~ "ukrainian",
                     bpld == 46540 ~  "armenian",
                     TRUE ~ NA_character_)
  
  GROUP_REV <- case_when(bpl %in% 400:499 & mtongued == 220 ~ "swiss",
                         bpl %in% 400:499 & mtongued == 210 ~ "austrian",
                         bpl %in% 400:499 & mtongued == 2610 ~ "bulgarian",
                         bpl %in% 400:499 & mtongue == 20 ~ "czech",
                         bpl %in% 400:499 & mtongue == 22 ~ "slovak",
                         bpl %in% 400:499 & mtongue == 2 ~ "german",
                         bpl %in% 400:499 & mtongue == 34 ~ "hungarian",
                         bpl %in% 400:499 & mtongue == 21 ~ "polish",
                         bpl %in% 400:499 & mtongue == 14 ~ "rumanian",
                         bpl %in% 400:499 & mtongued == 2310 ~ "croatian",
                         bpl %in% 400:499 & mtongued == 2320 ~ "serbian",
                         bpl %in% 400:499 & mtongued %in% 2330:2332 ~ "dalm./mont.",
                         bpl %in% 400:499 & mtongued == 2400 ~ "slovenian",
                         bpl %in% 400:499 & mtongued == 2500 ~ "lithuanian",
                         bpl %in% 400:499 & mtongue == 18 ~ "russian",
                         bpl %in% 400:499 & mtongued == 1930 ~ "ukrainian",
                         bpl %in% 400:499 & mtongued == 2800 ~ "armenian",
                         bpl %in% 400:499 & mtongue == 3 | mtongue == 59 ~ "jewish",
                         TRUE ~ GROUP)
  
  GROUP_FIN <- if_else(GROUP != GROUP_REV, GROUP_REV, GROUP)
  
  GROUP_FIN <- if_else(bpl %in% 1:56 & mbpl %in% 1:56 & fbpl %in% 1:56 & race == 1 & hispan == 0, "nativewhite", GROUP_FIN)
  
  GROUP_FIN <- if_else(bpl %in% 1:56 & race == 2, "nativeblack", GROUP_FIN)
  
  return(GROUP_FIN)
  
}

# ***************************************************** #
####################################################### #
#### *** REGRESSION WEIGHTS AND LINKAGE SKELETON *** ####
####################################################### #
# ***************************************************** #

# > Identifying coal producing counties ####

usgs_yearbooks_coal <- read_excel("usgs_mineral_yearbooks.xlsx",
                                  sheet = "County production") %>%
  mutate(multiple_counties = str_count(County, "\\bAND\\b")) %>% # identifying production data points that cover multiple counties
  mutate(multiple_counties = ifelse(is.na(multiple_counties), 0, multiple_counties))

usgs_yearbooks_coal <- usgs_yearbooks_coal[rep(seq_len(nrow(usgs_yearbooks_coal)), 
                                               usgs_yearbooks_coal$multiple_counties + 1), 
                                           1:ncol(usgs_yearbooks_coal)] %>% # reconstructing dataset at county-year level
  group_by(Year, State, County) %>% 
  mutate(group_no = row_number()) %>%
  ungroup() %>%
  mutate(Production = replace_na(Production, 0)) %>% 
  mutate(Production = ifelse(multiple_counties == 0, Production, NA)) %>% # eliminating multiple-county observations
  mutate(Production = ifelse(Mineral == "anthracite" & Production == 0, NA, Production))

usgs_yearbooks_coal$group_id <- usgs_yearbooks_coal %>% group_indices(Year, State, County)

usgs_yearbooks_coal %<>%
  mutate(County = ifelse(group_no > 1, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 2, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 3, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 4, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 5, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 6, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 7, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 8, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 9, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 10, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 11, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 12, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 13, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 14, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 15, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 16, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 17, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 18, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 19, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = str_remove(County, " AND.*"))

usgs_yearbooks_coal %<>%
  mutate_at(vars(Production), ~ ./(multiple_counties + 1)) %>%
  select(-group_no, -group_id)

.county_codes <- read_delim("icpsrcnt_pre1930.txt", # loading ICPSR county codes
                            delim = "|") %>%
  mutate(SCICP = as.factor(paste0(STATEICP, "-", COUNTYICP))) %>%
  select(State, County, SCICP) %>%
  mutate(County = tolower(County))

usgs_yearbooks_coal %<>% # fixing county name spellings
  mutate(County = ifelse(State == "Illinois" & County == "Marlon", "Marion", County),
         County = ifelse(State == "Illinois" & County == "Lasalle", "La Salle", County),
         County = ifelse(State == "Indiana" & (County == "Vanderburg" | County == "Vanderberg"), "Vanderburgh", County),
         County = ifelse(State == "Indiana" & County == "Vermilion", "Vermillion", County),
         County = ifelse(State == "Missouri" & County == "Charitan", "Chariton", County),
         County = ifelse(State == "Montana" & County == "Choteau", "Chouteau", County),
         County = ifelse(State == "Pennsylvania" & County == "Center", "Centre", County),
         County = ifelse(State == "Utah" & County == "Uinta", "Uintah", County),
         County = ifelse(State == "Alabama" & County == "Winston", "Winston/Hancock", County),
         County = ifelse(State == "Kansas" & County == "Cherokee", "Cherokee/Mcghee", County),
         County = ifelse(State == "Kansas" & County == "Neosho", "Neosho/Dorn", County),
         County = ifelse(State == "Missouri" & County == "Cass", "Cass/Van Buren", County),
         County = ifelse(State == "Missouri" & County == "Henry", "Henry/Rives", County),
         County = ifelse(State == "South Dakota" & County == "Perkins", "Perkins/Spink", County),
         County = ifelse(State == "North Dakota" & County == "Williamson", "Williams", County),
         County = ifelse(State == "Utah" & County == "Grant", "Grand", County),
         County = ifelse(State == "Utah" & County == "San Pete", "Sanpete", County),
         County = ifelse(State == "Iowa" & County == "Green", "Greene", County),
         County = ifelse(State == "Ohio" & County == "Galia", "Gallia", County),
         County = ifelse(State == "Maryland" & County == "Allegheny", "Allegany", County)) %>%
  mutate(County = tolower(County)) %>% 
  left_join(.county_codes, by = c("State", "County"))
usgs_yearbooks_coal %>% filter(is.na(SCICP)) %>% distinct(State, County, .keep_all = T)

usgs_yearbooks_coal %<>% filter(Year %in% c(1900, 1910, 1922)) # identifying counties producing coal these years

usgs_yearbooks_coal_1900 <- usgs_yearbooks_coal %>% filter(Year == 1900 & Production > 0) %$% unique(SCICP) %>% as.character()
usgs_yearbooks_coal_1910 <- usgs_yearbooks_coal %>% filter(Year == 1910 & Production > 0) %$% unique(SCICP) %>% as.character()
usgs_yearbooks_coal_1920 <- usgs_yearbooks_coal %>% filter(Year == 1922 & Production > 0) %$% unique(SCICP) %>% as.character() # 1920 coal producing counties based on 1922 USGS report, due to missingness in 1920 report

county_changes_icspr <- read_delim("ICPSR_FILE_NAME_HERE.txt", #DATA NEEDED: ICPSR, County Longitudinal Template, 1840-1990 (ICPSR 6576) -> https://doi.org/10.3886/ICPSR06576.v1
                                   delim = " ") %>%
  mutate(SCICP = as.factor(paste0(as.numeric(stateicp), "-", as.numeric(countyicp))))
changes_to_exclude_0010 <- county_changes_icspr %>% # identifying counties that changed borders
  filter(change1910 == 1) %>%
  pull(SCICP) %>%
  as.character() %>%
  sort()
changes_to_exclude_1020 <- county_changes_icspr %>%
  filter(change1920 == 1) %>%
  pull(SCICP) %>%
  as.character() %>% sort()

usgs_yearbooks_coal_1900 <- usgs_yearbooks_coal_1900[! usgs_yearbooks_coal_1900 %in% changes_to_exclude_0010] # excluding counties that changed borders in a given decade
usgs_yearbooks_coal_1910 <- usgs_yearbooks_coal_1910[! usgs_yearbooks_coal_1910 %in% changes_to_exclude_1020] # excluding counties that changed borders in a given decade

rm(list = c("county_changes_icspr", "usgs_yearbooks_coal", "changes_to_exclude_0010", "changes_to_exclude_1020"))

# > Complete-count ("Full Count") census records ####

c1900 <- fread("1900_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "factor",
                          STATEICP = "character",
                          COUNTYICP = "character"))
c1900 <- c1900[, SCICP := paste0(STATEICP, "-", COUNTYICP)] # combining ICPSR codes for state and county
c1900 <- c1900[SCICP %in% usgs_yearbooks_coal_1900] # limiting complete-count records to coal producing counties
c1900 <- c1900[, STATEICP := NULL]
c1900 <- c1900[, COUNTYICP := NULL]

c1910 <- fread("1910_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "factor",
                          STATEICP = "character",
                          COUNTYICP = "character"))
c1910 <- c1910[, SCICP := paste0(STATEICP, "-", COUNTYICP)]
c1910 <- c1910[SCICP %in% usgs_yearbooks_coal_1910]
c1910 <- c1910[, STATEICP := NULL]
c1910 <- c1910[, COUNTYICP := NULL]

# > MLP Crosswalks ####

mlp0010 <- fread("ORIGINAL_MLP_FILE_FOR_1900-10_HERE")
mlp0010 <- mlp0010[histid_1900 %in% c1900$HISTID] # limiting MLP individuals to those in census dataset (in coal producing county)
mlp0010 <- mlp0010[, c("histid_1900", "histid_1910", "round"), with = FALSE]
fwrite(mlp0010, "mlp0010.csv")
rm(c1900)

mlp1020 <- fread("ORIGINAL_MLP_FILE_FOR_1910-20_HERE")
mlp1020 <- mlp1020[histid_1910 %in% c1910$HISTID]
mlp1020 <- mlp1020[, c("histid_1910", "histid_1920", "round"), with = FALSE]
fwrite(mlp1020, "mlp1020.csv")
rm(c1910)

# > Weight comparison datasets ####
# -- Note: this section based on code made available by the Census Linking Project.
# ---- Ran Abramitzky, Leah Boustan, Katherine Eriksson, Santiago Pérez and Myera Rashid. Census Linking Project: Version 2.0 [dataset]. 2020. https://censuslinkingproject.org

# >> 1900 ####

c1900 <- fread("1900_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "character",
                          STATEICP = "character",
                          COUNTYICP = "character",
                          SEX = "integer",
                          AGE = "integer",
                          OCCSCORE = "integer",
                          LIT = "integer",
                          URBAN = "integer",
                          CITIZEN = "integer",
                          BPL = "integer"))
c1900 <- c1900[HISTID != ""]
c1900 <- unique(c1900, by = "HISTID")
c1900 <- c1900[, SCICP := paste0(STATEICP, "-", COUNTYICP)]
c1900 <- c1900[SCICP %in% usgs_yearbooks_coal_1900]

mlp0010 <- mlp0010[, matched := 1]

c1900 <- merge(x = c1900,
               y = mlp0010,
               by.x = "HISTID",
               by.y = "histid_1900",
               all.x = T)

c1900 <- c1900[, age_bins := 0]
c1900 <- c1900[AGE %in% 10:19, age_bins := 1]
c1900 <- c1900[AGE %in% 20:29, age_bins := 2]
c1900 <- c1900[AGE %in% 30:39, age_bins := 3]
c1900 <- c1900[AGE %in% 40:49, age_bins := 4]
c1900 <- c1900[AGE %in% 50:59, age_bins := 5]
c1900 <- c1900[AGE %in% 60:69, age_bins := 6]
c1900 <- c1900[AGE %in% 70:79, age_bins := 7]
c1900 <- c1900[AGE %in% 80:89, age_bins := 8]
c1900 <- c1900[AGE >= 90, age_bins := 9]

c1900 <- c1900[, occscore_bins := 0]
c1900 <- c1900[OCCSCORE %in% 10:19, occscore_bins := 1]
c1900 <- c1900[OCCSCORE %in% 20:29, occscore_bins := 2]
c1900 <- c1900[OCCSCORE %in% 30:39, occscore_bins := 3]
c1900 <- c1900[OCCSCORE %in% 40:49, occscore_bins := 4]
c1900 <- c1900[OCCSCORE >= 50, occscore_bins := 5]

c1900 <- c1900[, male_dummy := ifelse(SEX == 1, 1, 0)]
c1900 <- c1900[, lit_dummy := ifelse(LIT == 4, 1, 0)]
c1900 <- c1900[, urban_dummy := ifelse(URBAN == 2, 1, 0)]
c1900 <- c1900[, immig_dummy := ifelse(CITIZEN %in% 2:4, 1, 0)]

c1900 <- c1900[, age_bins := as.factor(age_bins)]
c1900 <- c1900[, occscore_bins := as.factor(occscore_bins)]
c1900 <- c1900[, male_dummy := as.factor(male_dummy)]
c1900 <- c1900[, lit_dummy := as.factor(lit_dummy)]
c1900 <- c1900[, urban_dummy := as.factor(urban_dummy)]
c1900 <- c1900[, immig_dummy := as.factor(immig_dummy)]

c1900_probit <- glm(matched ~ age_bins + occscore_bins + male_dummy + lit_dummy + urban_dummy + immig_dummy,
                    family = binomial(link = "probit"), 
                    data = c1900)
c1900_probit_predict <- data.table(predict = predict(c1900_probit, type = "response"))
c1900_probit_predict <- c1900_probit_predict[, weight := (1 - predict)/predict]

c1900 <- cbind(c1900, c1900_probit_predict)
c1900_matched_mean <- mean(c1900$matched)
c1900 <- c1900[, weight2 := weight*c1900_matched_mean/(1-c1900_matched_mean)]
c1900 <- c1900[matched == 0, weight := 1]
c1900 <- c1900[matched == 0, weight2 := 1]

c1900_export <- c1900[, c("HISTID", "weight", "weight2"), with = FALSE]
colnames(c1900_export)[2:3] <- c("weight_coal", "weight2_coal")
fwrite(c1900_export, "mlp0010_weights.csv")
rm(list = c("c1900", "c1900_export", "c1900_probit", "c1900_probit_predict"))

# >> 1910 ####

c1910 <- fread("1910_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "character",
                          STATEICP = "character",
                          COUNTYICP = "character",
                          SEX = "integer",
                          AGE = "integer",
                          OCCSCORE = "integer",
                          LIT = "integer",
                          URBAN = "integer",
                          CITIZEN = "integer",
                          BPL = "integer"))
c1910 <- c1910[HISTID != ""]
c1910 <- unique(c1910, by = "HISTID")
c1910 <- c1910[, SCICP := paste0(STATEICP, "-", COUNTYICP)]
c1910 <- c1910[SCICP %in% usgs_yearbooks_coal_1910]

mlp1020 <- mlp1020[, matched := 1]

c1910 <- merge(x = c1910,
               y = mlp1020,
               by.x = "HISTID",
               by.y = "histid_1910",
               all.x = T)

c1910 <- c1910[, age_bins := 0]
c1910 <- c1910[AGE %in% 10:19, age_bins := 1]
c1910 <- c1910[AGE %in% 20:29, age_bins := 2]
c1910 <- c1910[AGE %in% 30:39, age_bins := 3]
c1910 <- c1910[AGE %in% 40:49, age_bins := 4]
c1910 <- c1910[AGE %in% 50:59, age_bins := 5]
c1910 <- c1910[AGE %in% 60:69, age_bins := 6]
c1910 <- c1910[AGE %in% 70:79, age_bins := 7]
c1910 <- c1910[AGE %in% 80:89, age_bins := 8]
c1910 <- c1910[AGE >= 90, age_bins := 9]

c1910 <- c1910[, occscore_bins := 0]
c1910 <- c1910[OCCSCORE %in% 10:19, occscore_bins := 1]
c1910 <- c1910[OCCSCORE %in% 20:29, occscore_bins := 2]
c1910 <- c1910[OCCSCORE %in% 30:39, occscore_bins := 3]
c1910 <- c1910[OCCSCORE %in% 40:49, occscore_bins := 4]
c1910 <- c1910[OCCSCORE >= 50, occscore_bins := 5]

c1910 <- c1910[, male_dummy := ifelse(SEX == 1, 1, 0)]
c1910 <- c1910[, lit_dummy := ifelse(LIT == 4, 1, 0)]
c1910 <- c1910[, urban_dummy := ifelse(URBAN == 2, 1, 0)]
c1910 <- c1910[, immig_dummy := ifelse(CITIZEN %in% 2:4, 1, 0)]

c1910 <- c1910[, age_bins := as.factor(age_bins)]
c1910 <- c1910[, occscore_bins := as.factor(occscore_bins)]
c1910 <- c1910[, male_dummy := as.factor(male_dummy)]
c1910 <- c1910[, lit_dummy := as.factor(lit_dummy)]
c1910 <- c1910[, urban_dummy := as.factor(urban_dummy)]
c1910 <- c1910[, immig_dummy := as.factor(immig_dummy)]

c1910_probit <- glm(matched ~ age_bins + occscore_bins + male_dummy + lit_dummy + urban_dummy + immig_dummy,
                    family = binomial(link = "probit"), 
                    data = c1910)
c1910_probit_predict <- data.table(predict = predict(c1910_probit, type = "response"))
c1910_probit_predict <- c1910_probit_predict[, weight := (1 - predict)/predict]

c1910 <- cbind(c1910, c1910_probit_predict)
c1910_matched_mean <- mean(c1910$matched)
c1910 <- c1910[, weight2 := weight*c1910_matched_mean/(1-c1910_matched_mean)]
c1910 <- c1910[matched == 0, weight := 1]
c1910 <- c1910[matched == 0, weight2 := 1]

c1910_export <- c1910[, c("HISTID", "weight", "weight2"), with = F]
colnames(c1910_export)[2:3] <- c("weight_coal", "weight2_coal")
fwrite(c1910_export, "mlp1020_weights.csv")
rm(list = c("c1910", "c1910_export", "c1910_probit", "c1910_probit_predict"))

# *************************************** #
######################################### #
#### *** DATA LOADING AND CLEANING *** ####
######################################### #
# *************************************** #

# > Main linkage skeleton ####

mlp0010 <- fread("mlp0010.csv")
mlp1020 <- fread("mlp1020.csv")

mlp0010 <- mlp0010[, RANGE := 10] # identifying 1900-10 range
mlp1020 <- mlp1020[, RANGE := 1020] # identifying 1910-20 range

mlp0010_weights <- fread("mlp0010_weights.csv")
mlp1020_weights <- fread("mlp1020_weights.csv")
mlp0010 <- merge.data.table(x = mlp0010,
                            y = mlp0010_weights,
                            by.x = "histid_1900",
                            by.y = "HISTID",
                            all.x = TRUE)
mlp1020 <- merge.data.table(x = mlp1020,
                            y = mlp1020_weights,
                            by.x = "histid_1910",
                            by.y = "HISTID",
                            all.x = TRUE)

colnames(mlp0010)[1:2] <- c("HISTID_START", "HISTID_END")
colnames(mlp1020)[1:2] <- c("HISTID_START", "HISTID_END")

mlp <- rbindlist(l = list(mlp0010, mlp1020))

# > Census covariates ####

# >> 1940 income data ####
# - Note: to calculate yourself, 1940 Full Count census must be obtained from IPUMS USA directly.
# - Skip commented lines to avoid calculating income estimates yourself.

#census1940 <- fread("1940_FULL_COUNT_FILE_HERE", select = c("IND1950",
#                                                            "OCC1950",
#                                                            "EMPSTAT",
#                                                            "RACE",
#                                                            "BPL",
#                                                            "INCWAGE",
#                                                            "STATEICP",
#                                                            "AGE",
#                                                            "SEX"))

#census1940_incomes <- census1940[IND1950 %in% 206:976 & OCC1950 %in% 0:970 & EMPSTAT == 1 & BPL %in% c(400:499)] # limiting to European immigrants outside of agriculture
#census1940_incomes <- census1940_incomes[, INCWAGE := ifelse(INCWAGE == 999998 | INCWAGE == 0, NA, INCWAGE)]

#census1940_incomes <- census1940_incomes[, OCC1950 := as.factor(OCC1950)]
#census1940_incomes <- census1940_incomes[, BPL := as.factor(BPL)]
#census1940_incomes <- census1940_incomes[, RACE := as.factor(RACE)]
#census1940_incomes <- census1940_incomes[, SEX := as.factor(SEX)]

#census1940_incomes_strata <- census1940_incomes[, .(INCWAGE_MEAN = mean(INCWAGE, na.rm = TRUE),
#                                                    INCWAGE_MEDIAN = median(INCWAGE, na.rm = TRUE)), 
#                                                by = list(STATEICP, OCC1950, BPL, RACE, SEX)]
#fwrite(census1940_incomes_strata, "census1940_incomes_strata.csv")

census1940_incomes_strata <- fread("census1940_incomes_strata.csv") # estimates of income based on 1940 census

# >> Complete-count IPUMS data, merged in by HISTID ####

ipums_col_set01 <- c(HISTID = "factor",
                     SERIAL = "integer",
                     RELATE = "integer",
                     STATEICP = "integer",
                     COUNTYICP = "integer",
                     URBAN = "integer",
                     FARM = "integer",
                     SEX = "integer",
                     AGE = "integer",
                     RACE = "integer",
                     HISPAN = "integer",
                     MTONGUE = "integer",
                     MTONGUED = "integer",
                     BPL = "integer",
                     BPLD = "integer",
                     FBPL = "integer",
                     MBPL = "integer",
                     CITIZEN = "integer",
                     YRIMMIG = "integer",
                     OCC1950 = "integer",
                     IND1950 = "integer",
                     LIT = "integer")

# >>> 1900 census ####

c1900 <- fread("1900_FULL_COUNT_CENSUS_FILE_HERE",
               select = ipums_col_set01)

c1900 <- c1900[, GROUP := ethnicity(bpl = BPL, bpld = BPLD, mbpl = MBPL, fbpl = FBPL, race = RACE, hispan = HISPAN, mtongue = NA, mtongued = NA)]

c1900 <- merge.data.table(x = c1900,
                          y = census1940_incomes_strata,
                          by = c("STATEICP", "OCC1950", "BPL", "RACE", "SEX"),
                          all.x = T)

c1900 <- c1900[, SCICP := as.factor(paste0(STATEICP,"-",COUNTYICP))]

c1900 <- c1900[, `:=` (GROUP_COUNTY = .N), by = list(SCICP, GROUP)]
c1900 <- c1900[, `:=` (GROUP_COAL = sum(IND1950 == 216, na.rm = TRUE)), by = list(SCICP, GROUP)]
c1900 <- c1900[, `:=` (GROUP_WORKERS = sum(IND1950 %in% 105:976, na.rm = TRUE)), by = list(SCICP, GROUP)]
c1900 <- c1900[, `:=` (GROUP_MALE = sum(SEX == 1, na.rm = TRUE)), by = list(SCICP, GROUP)]
c1900 <- c1900[, `:=` (GROUP_INCWAGE_MEDIAN = median(INCWAGE_MEDIAN[IND1950 %in% 105:976], na.rm = TRUE)), by = list(SCICP, GROUP)]
c1900 <- c1900[, `:=` (TOTAL_COUNTY_COAL = sum(IND1950 == 216, na.rm = TRUE)), by = SCICP]
c1900 <- c1900[, `:=` (TOTAL_COUNTY = .N), by = SCICP]
c1900 <- c1900[, `:=` (TOTAL_COUNTY_WORKERS = sum(IND1950 %in% 105:976, na.rm = TRUE)), by = SCICP]
c1900 <- c1900[, `:=` (TOTAL_MALE_21_COUNTY = sum(SEX == 1 & AGE >= 21, na.rm = TRUE)), by = SCICP]
c1900 <- c1900[, `:=` (TOTAL_FEMALE_21_COUNTY = sum(SEX == 2 & AGE >= 21, na.rm = TRUE)), by = SCICP]
c1900 <- c1900[, `:=` (TOTAL_NWHITE_COUNTY = sum(GROUP == "nativewhite", na.rm = TRUE)), by = SCICP]
c1900 <- c1900[, `:=` (TOTAL_NBLACK_COUNTY = sum(GROUP == "nativeblack", na.rm = TRUE)), by = SCICP]
c1900 <- c1900[, `:=` (NWHITE_COAL = sum(IND1950 == 216 & GROUP == "nativewhite", na.rm = TRUE)), by = SCICP]
c1900 <- c1900[, `:=` (NBLACK_COAL = sum(IND1950 == 216 & GROUP == "nativeblack", na.rm = TRUE)), by = SCICP]
c1900 <- c1900[, `:=` (RURAL_COUNTY = sum(URBAN == 1, na.rm = TRUE)), by = SCICP]

c1900 <- c1900[, RANGE := 10]

mlp0010 <- mlp[RANGE == 10]
mlp0010 <- merge.data.table(x = mlp0010,
                            y = c1900,
                            by.x = c("HISTID_START", "RANGE"),
                            by.y = c("HISTID", "RANGE"))

rm(c1900)

# >>> 1910 census ####

c1910 <- fread("1910_FULL_COUNT_CENSUS_FILE_HERE",
               select = ipums_col_set01)

c1910 <- c1910[, GROUP := ethnicity(bpl = BPL, bpld = BPLD, mbpl = MBPL, fbpl = FBPL, race = RACE, hispan = HISPAN, mtongue = NA, mtongued = NA)]

c1910 <- merge.data.table(x = c1910,
                          y = census1940_incomes_strata,
                          by = c("STATEICP", "OCC1950", "BPL", "RACE", "SEX"),
                          all.x = T)

c1910 <- c1910[, SCICP := as.factor(paste0(STATEICP,"-",COUNTYICP))]

c1910 <- c1910[, `:=` (GROUP_COUNTY = .N), by = list(SCICP, GROUP)]
c1910 <- c1910[, `:=` (GROUP_COAL = sum(IND1950 == 216, na.rm = TRUE)), by = list(SCICP, GROUP)]
c1910 <- c1910[, `:=` (GROUP_WORKERS = sum(IND1950 %in% 105:976, na.rm = TRUE)), by = list(SCICP, GROUP)]
c1910 <- c1910[, `:=` (GROUP_MALE = sum(SEX == 1, na.rm = TRUE)), by = list(SCICP, GROUP)]
c1910 <- c1910[, `:=` (GROUP_INCWAGE_MEDIAN = median(INCWAGE_MEDIAN[IND1950 %in% 105:976], na.rm = TRUE)), by = list(SCICP, GROUP)]
c1910 <- c1910[, `:=` (TOTAL_COUNTY_COAL = sum(IND1950 == 216, na.rm = TRUE)), by = SCICP]
c1910 <- c1910[, `:=` (TOTAL_COUNTY = .N), by = SCICP]
c1910 <- c1910[, `:=` (TOTAL_COUNTY_WORKERS = sum(IND1950 %in% 105:976, na.rm = TRUE)), by = SCICP]
c1910 <- c1910[, `:=` (TOTAL_MALE_21_COUNTY = sum(SEX == 1 & AGE >= 21, na.rm = TRUE)), by = SCICP]
c1910 <- c1910[, `:=` (TOTAL_FEMALE_21_COUNTY = sum(SEX == 2 & AGE >= 21, na.rm = TRUE)), by = SCICP]
c1910 <- c1910[, `:=` (TOTAL_NWHITE_COUNTY = sum(GROUP == "nativewhite", na.rm = TRUE)), by = SCICP]
c1910 <- c1910[, `:=` (TOTAL_NBLACK_COUNTY = sum(GROUP == "nativeblack", na.rm = TRUE)), by = SCICP]
c1910 <- c1910[, `:=` (NWHITE_COAL = sum(IND1950 == 216 & GROUP == "nativewhite", na.rm = TRUE)), by = SCICP]
c1910 <- c1910[, `:=` (NBLACK_COAL = sum(IND1950 == 216 & GROUP == "nativeblack", na.rm = TRUE)), by = SCICP]
c1910 <- c1910[, `:=` (RURAL_COUNTY = sum(URBAN == 1, na.rm = TRUE)), by = SCICP]

mlp0010 <- merge.data.table(x = mlp0010,
                            y = c1910,
                            by.x = "HISTID_END",
                            by.y = "HISTID",
                            suffixes = c("_START", "_END"))

mlp1020 <- mlp[RANGE == 1020]
mlp1020 <- merge.data.table(x = mlp1020,
                            y = c1910,
                            by.x = "HISTID_START",
                            by.y = "HISTID")

rm(c1910)

# >>> 1920 census ####

c1920 <- fread("1920_FULL_COUNT_CENSUS_FILE_HERE",
               select = ipums_col_set01)

c1920 <- c1920[, GROUP := ethnicity(bpl = BPL, bpld = BPLD, mbpl = MBPL, fbpl = FBPL, race = RACE, hispan = HISPAN, mtongue = NA, mtongued = NA)]
c1920 <- c1920[, GROUP := as.factor(GROUP)]

c1920 <- merge.data.table(x = c1920,
                          y = census1940_incomes_strata,
                          by = c("STATEICP", "OCC1950", "BPL", "RACE", "SEX"),
                          all.x = T)

c1920 <- c1920[, SCICP := as.factor(paste0(STATEICP,"-",COUNTYICP))]

c1920 <- c1920[, `:=` (GROUP_COUNTY = .N), by = list(SCICP, GROUP)]
c1920 <- c1920[, `:=` (GROUP_COAL = sum(IND1950 == 216, na.rm = TRUE)), by = list(SCICP, GROUP)]
c1920 <- c1920[, `:=` (GROUP_WORKERS = sum(IND1950 %in% 105:976, na.rm = TRUE)), by = list(SCICP, GROUP)]
c1920 <- c1920[, `:=` (GROUP_MALE = sum(SEX == 1, na.rm = TRUE)), by = list(SCICP, GROUP)]
c1920 <- c1920[, `:=` (GROUP_INCWAGE_MEDIAN = median(INCWAGE_MEDIAN[IND1950 %in% 105:976], na.rm = TRUE)), by = list(SCICP, GROUP)]
c1920 <- c1920[, `:=` (TOTAL_COUNTY_COAL = sum(IND1950 == 216, na.rm = TRUE)), by = SCICP]
c1920 <- c1920[, `:=` (TOTAL_COUNTY = .N), by = SCICP]
c1920 <- c1920[, `:=` (TOTAL_COUNTY_WORKERS = sum(IND1950 %in% 105:976, na.rm = TRUE)), by = SCICP]
c1920 <- c1920[, `:=` (TOTAL_MALE_21_COUNTY = sum(SEX == 1 & AGE >= 21, na.rm = TRUE)), by = SCICP]
c1920 <- c1920[, `:=` (TOTAL_FEMALE_21_COUNTY = sum(SEX == 2 & AGE >= 21, na.rm = TRUE)), by = SCICP]
c1920 <- c1920[, `:=` (TOTAL_NWHITE_COUNTY = sum(GROUP == "nativewhite", na.rm = TRUE)), by = SCICP]
c1920 <- c1920[, `:=` (TOTAL_NBLACK_COUNTY = sum(GROUP == "nativeblack", na.rm = TRUE)), by = SCICP]
c1920 <- c1920[, `:=` (NWHITE_COAL = sum(IND1950 == 216 & GROUP == "nativewhite", na.rm = TRUE)), by = SCICP]
c1920 <- c1920[, `:=` (NBLACK_COAL = sum(IND1950 == 216 & GROUP == "nativeblack", na.rm = TRUE)), by = SCICP]
c1920 <- c1920[, `:=` (RURAL_COUNTY = sum(URBAN == 1, na.rm = TRUE)), by = SCICP]

mlp1020 <- merge.data.table(x = mlp1020,
                            y = c1920,
                            by.x = "HISTID_END",
                            by.y = "HISTID",
                            suffixes = c("_START", "_END"))

# > Binding into single linkage dataset ####

mlp_full <- rbindlist(l = list(mlp0010,
                               mlp1020),
                      fill = TRUE)

rm(list = c("mlp0010", "mlp1020"))

# > Excluding counties that change borders ####
# - Loading ICPSR, County Longitudinal Template, 1840-1990 (ICPSR 6576)

county_changes_icspr <- read_delim("ICPSR_FILE_NAME_HERE.txt",
                                   delim = " ") %>%
  mutate(SCICP = as.factor(paste0(as.numeric(stateicp), "-", as.numeric(countyicp))))

changes_to_exclude_0010 <- county_changes_icspr %>%
  filter(change1910 == 1) %>%
  pull(SCICP) %>%
  as.character() %>%
  sort()
changes_to_exclude_1020 <- county_changes_icspr %>%
  filter(change1920 == 1) %>%
  pull(SCICP) %>%
  as.character() %>% sort()
changes_to_exclude_2030 <- county_changes_icspr %>%
  filter(change1930 == 1) %>%
  pull(SCICP) %>%
  as.character() %>% sort()

counties_to_exclude <- data.table(SCICP = c(changes_to_exclude_0010,
                                            changes_to_exclude_1020,
                                            changes_to_exclude_2030),
                                  RANGE = c(rep(10, length(changes_to_exclude_0010)),
                                            rep(1020, length(changes_to_exclude_1020)),
                                            rep(2030, length(changes_to_exclude_2030))))

counties_to_exclude <- counties_to_exclude[, COUNTY_BORDER_CHANGE := TRUE]

mlp_full <- merge.data.table(x = mlp_full,
                             y = counties_to_exclude,
                             by.x = c("SCICP_START", "RANGE"),
                             by.y = c("SCICP", "RANGE"),
                             all.x = TRUE)
mlp_full <- merge.data.table(x = mlp_full,
                             y = counties_to_exclude,
                             by.x = c("SCICP_END", "RANGE"),
                             by.y = c("SCICP", "RANGE"),
                             all.x = TRUE,
                             suffixes = c("_START", "_END"))

mlp_full <- mlp_full[is.na(COUNTY_BORDER_CHANGE_START) & is.na(COUNTY_BORDER_CHANGE_END)]

mlp_full <- mlp_full[, COUNTY_BORDER_CHANGE_START := NULL]
mlp_full <- mlp_full[, COUNTY_BORDER_CHANGE_END := NULL]

rm(list = c("counties_to_exclude", "county_changes_icspr"))

# > Bartik estimation of general economic conditions ####
# - Skip bracketed code block to load data at line 848: bartik_dt2 <- fread("bartik_dt.csv")

{
  # Complete-Count Censuses
  
  c1900 <- fread("1900_FULL_COUNT_FILE_HERE",
                 select = c(STATEICP = "character",
                            COUNTYICP = "character",
                            IND1950 = "integer"))
  c1900 <- c1900[, SCICP := paste0(STATEICP, "-", COUNTYICP)]
  c1900 <- c1900[, STATEICP := NULL]
  c1900 <- c1900[, COUNTYICP := NULL]
  
  c1910 <- fread("1910_FULL_COUNT_FILE_HERE",
                 select = c(STATEICP = "character",
                            COUNTYICP = "character",
                            IND1950 = "integer"))
  c1910 <- c1910[, SCICP := paste0(STATEICP, "-", COUNTYICP)]
  c1910 <- c1910[, STATEICP := NULL]
  c1910 <- c1910[, COUNTYICP := NULL]
  
  c1920 <- fread("1920_FULL_COUNT_FILE_HERE",
                 select = c(STATEICP = "character",
                            COUNTYICP = "character",
                            IND1950 = "integer"))
  c1920 <- c1920[, SCICP := paste0(STATEICP, "-", COUNTYICP)]
  c1920 <- c1920[, STATEICP := NULL]
  c1920 <- c1920[, COUNTYICP := NULL]
  
  # Categorizing IND1950
  
  c1900 <- c1900[IND1950 %in% 105:976]
  c1910 <- c1910[IND1950 %in% 105:976]
  c1920 <- c1920[IND1950 %in% 105:976]
  
  durable <- 306:399
  nondurable <- 406:499
  agriculture <- 105
  petroleum <- 226
  nonmetals <- 236
  metals <- 206
  
  c1900 <- c1900[, DURABLE := ifelse(IND1950 %in% durable, 1, 0)]
  c1900 <- c1900[, NONDURABLE := ifelse(IND1950 %in% nondurable, 1, 0)]
  c1900 <- c1900[, AGRI := ifelse(IND1950 %in% agriculture, 1, 0)]
  c1900 <- c1900[, PETRO := ifelse(IND1950 %in% petroleum, 1, 0)]
  c1900 <- c1900[, NONMETALS := ifelse(IND1950 %in% nonmetals, 1, 0)]
  c1900 <- c1900[, METALS := ifelse(IND1950 %in% metals, 1, 0)]
  
  c1910 <- c1910[, DURABLE := ifelse(IND1950 %in% durable, 1, 0)]
  c1910 <- c1910[, NONDURABLE := ifelse(IND1950 %in% nondurable, 1, 0)]
  c1910 <- c1910[, AGRI := ifelse(IND1950 %in% agriculture, 1, 0)]
  c1910 <- c1910[, PETRO := ifelse(IND1950 %in% petroleum, 1, 0)]
  c1910 <- c1910[, NONMETALS := ifelse(IND1950 %in% nonmetals, 1, 0)]
  c1910 <- c1910[, METALS := ifelse(IND1950 %in% metals, 1, 0)]
  
  c1920 <- c1920[, DURABLE := ifelse(IND1950 %in% durable, 1, 0)]
  c1920 <- c1920[, NONDURABLE := ifelse(IND1950 %in% nondurable, 1, 0)]
  c1920 <- c1920[, AGRI := ifelse(IND1950 %in% agriculture, 1, 0)]
  c1920 <- c1920[, PETRO := ifelse(IND1950 %in% petroleum, 1, 0)]
  c1920 <- c1920[, NONMETALS := ifelse(IND1950 %in% nonmetals, 1, 0)]
  c1920 <- c1920[, METALS := ifelse(IND1950 %in% metals, 1, 0)]
  
  c1900a <- c1900[, .(DURABLE = mean(DURABLE, na.rm = TRUE),
                      NONDURABLE = mean(NONDURABLE, na.rm = TRUE),
                      AGRI = mean(AGRI, na.rm = TRUE),
                      PETRO = mean(PETRO, na.rm = TRUE),
                      NONMETALS = mean(NONMETALS, na.rm = TRUE),
                      METALS = mean(METALS, na.rm = TRUE)), by = SCICP]
  c1910a <- c1910[, .(DURABLE = mean(DURABLE, na.rm = TRUE),
                      NONDURABLE = mean(NONDURABLE, na.rm = TRUE),
                      AGRI = mean(AGRI, na.rm = TRUE),
                      PETRO = mean(PETRO, na.rm = TRUE),
                      NONMETALS = mean(NONMETALS, na.rm = TRUE),
                      METALS = mean(METALS, na.rm = TRUE)), by = SCICP]
  c1920a <- c1920[, .(DURABLE = mean(DURABLE, na.rm = TRUE),
                      NONDURABLE = mean(NONDURABLE, na.rm = TRUE),
                      AGRI = mean(AGRI, na.rm = TRUE),
                      PETRO = mean(PETRO, na.rm = TRUE),
                      NONMETALS = mean(NONMETALS, na.rm = TRUE),
                      METALS = mean(METALS, na.rm = TRUE)), by = SCICP]
  
  c1900a <- c1900a[, RANGE := 10]
  c1910a <- c1910a[, RANGE := 1020]
  c1920a <- c1920a[, RANGE := 2030]
  
  bartik_dt <- rbindlist(l = list(c1900a,
                                  c1910a,
                                  c1920a))
  
  # Bartik forcers
  
  bartik_yearly <- read_excel("bartik.xlsx") %>% # non-coal production data
    arrange(year)
  
  bartik_yearly2 <- bartik_yearly %>%
    mutate_at(vars(-year), ~ (. - dplyr::lag(., 1))/dplyr::lag(., 1)) %>%
    mutate(RANGE = case_when(year %in% 1900:1909 ~ 10,
                             year %in% 1910:1919 ~ 1020,
                             year %in% 1920:1929 ~ 2030)) %>%
    group_by(RANGE) %>%
    summarize_at(vars(-year), ~ sum(.[. < 0], na.rm = TRUE)) %>%
    ungroup()
  setDT(bartik_yearly2)
  
  bartik_yearly2_growth <- bartik_yearly %>%
    mutate_at(vars(-year), ~ (. - dplyr::lag(., 1))/dplyr::lag(., 1)) %>%
    mutate(RANGE = case_when(year %in% 1900:1909 ~ 10,
                             year %in% 1910:1919 ~ 1020,
                             year %in% 1920:1929 ~ 2030)) %>%
    group_by(RANGE) %>%
    summarize_at(vars(-year), ~ sum(.[. > 0], na.rm = TRUE)) %>%
    ungroup()
  setDT(bartik_yearly2_growth)
  
  bartik_dt <- merge.data.table(x = bartik_dt,
                                y = bartik_yearly2,
                                by = "RANGE",
                                all.x = TRUE)
  bartik_dt <- merge.data.table(x = bartik_dt,
                                y = bartik_yearly2_growth,
                                by = "RANGE",
                                suffixes = c("", "_growth"),
                                all.x = TRUE)
  
  bartik_dt <- bartik_dt[, DURABLE_BARTIK := DURABLE*manuf_durable_value]
  bartik_dt <- bartik_dt[, NONDURABLE_BARTIK := NONDURABLE*manuf_allnondur_value]
  bartik_dt <- bartik_dt[, AGRI_BARTIK := AGRI*agri_wholesale_price_index]
  bartik_dt <- bartik_dt[, PETRO_BARTIK := PETRO*petroleum_value]
  bartik_dt <- bartik_dt[, NONMETALS_BARTIK := NONMETALS*nonmetals_value]
  bartik_dt <- bartik_dt[, METALS_BARTIK := METALS*metals_value]
  
  bartik_dt <- bartik_dt[, DURABLE_BARTIK_GROWTH := DURABLE*manuf_durable_value_growth]
  bartik_dt <- bartik_dt[, NONDURABLE_BARTIK_GROWTH := NONDURABLE*manuf_allnondur_value_growth]
  bartik_dt <- bartik_dt[, AGRI_BARTIK_GROWTH := AGRI*agri_wholesale_price_index_growth]
  bartik_dt <- bartik_dt[, PETRO_BARTIK_GROWTH := PETRO*petroleum_value_growth]
  bartik_dt <- bartik_dt[, NONMETALS_BARTIK_GROWTH := NONMETALS*nonmetals_value_growth]
  bartik_dt <- bartik_dt[, METALS_BARTIK_GROWTH := METALS*metals_value_growth]
  
  bartik_dt_decline <- bartik_dt[, .(BARTIK_MEAN = rowMeans(.SD)), by = list(SCICP, RANGE),
                                 .SDcols = c("DURABLE_BARTIK",
                                             "NONDURABLE_BARTIK",
                                             "AGRI_BARTIK",
                                             "PETRO_BARTIK",
                                             "NONMETALS_BARTIK",
                                             "METALS_BARTIK"
                                 )]
  
  bartik_dt_growth <- bartik_dt[, .(BARTIK_MEAN_GROWTH = rowMeans(.SD)), by = list(SCICP, RANGE),
                                .SDcols = c("DURABLE_BARTIK_GROWTH",
                                            "NONDURABLE_BARTIK_GROWTH",
                                            "AGRI_BARTIK_GROWTH",
                                            "PETRO_BARTIK_GROWTH",
                                            "NONMETALS_BARTIK_GROWTH",
                                            "METALS_BARTIK_GROWTH"
                                )]
  
  bartik_dt <- merge.data.table(x = bartik_dt_decline,
                                y = bartik_dt_growth,
                                by = c("SCICP", "RANGE"))
  
  bartik_dt2 <- bartik_dt %>%
    select(RANGE, SCICP, BARTIK_MEAN, BARTIK_MEAN_GROWTH) %>%
    rename(SCICP_START = SCICP)
  
  bartik_dt2 <- bartik_dt2[, BARTIK_MEAN_TOT := BARTIK_MEAN + BARTIK_MEAN_GROWTH]
  
  #fwrite(bartik_dt2, "bartik_dt.csv")
  
}

bartik_dt2 <- fread("bartik_dt.csv")

mlp_full <- merge.data.table(x = mlp_full,
                             y = bartik_dt2,
                             by = c("SCICP_START", "RANGE"),
                             all.x = TRUE)

mlp_full <- mlp_full[, BARTIK_MEAN_SQRT := sqrt(abs(BARTIK_MEAN))]
mlp_full <- mlp_full[, BARTIK_MEAN_GROWTH_SQRT := sqrt(abs(BARTIK_MEAN_GROWTH))]
mlp_full <- mlp_full[, BARTIK_MEAN_TOT_SQRT := sqrt(abs(BARTIK_MEAN_TOT))]

# >> Leading Bartik ####

bartik_lead <- copy(bartik_dt2)
bartik_lead <- bartik_lead[, RANGE := fcase(RANGE == 1020, 10,
                                            RANGE == 2030, 1020,
                                            RANGE == 10, NA_real_)]

mlp_full <- merge.data.table(x = mlp_full,
                             y = bartik_lead,
                             by = c("SCICP_START", "RANGE"),
                             all.x = TRUE,
                             suffixes = c("", "_LEAD"))
mlp_full <- mlp_full[, BARTIK_MEAN_LEAD_SQRT := sqrt(abs(BARTIK_MEAN_LEAD))]

rm(bartik_dt2); rm(bartik_lead)

# > Base measure of coal production ####

# >> Creating measures ####

usgs_yearbooks_coal <- read_excel("usgs_mineral_yearbooks.xlsx",
                                  sheet = "County production") %>%
  mutate(multiple_counties = str_count(County, "\\bAND\\b")) %>%
  mutate(multiple_counties = ifelse(is.na(multiple_counties), 0, multiple_counties))

usgs_yearbooks_coal <- usgs_yearbooks_coal[rep(seq_len(nrow(usgs_yearbooks_coal)), 
                                               usgs_yearbooks_coal$multiple_counties + 1), 
                                           1:ncol(usgs_yearbooks_coal)] %>% 
  group_by(Year, State, County) %>% 
  mutate(group_no = row_number()) %>%
  ungroup() %>%
  mutate(Production = replace_na(Production, 0)) %>% 
  mutate(Production = ifelse(multiple_counties == 0, Production, NA)) %>%
  mutate(Production = ifelse(Mineral == "anthracite" & Production == 0, NA, Production))

usgs_yearbooks_coal$group_id <- usgs_yearbooks_coal %>% group_indices(Year, State, County)

usgs_yearbooks_coal %<>%
  mutate(County = ifelse(group_no > 1, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 2, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 3, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 4, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 5, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 6, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 7, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 8, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 9, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 10, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 11, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 12, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 13, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 14, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 15, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 16, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 17, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 18, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = ifelse(group_no > 19, str_extract(County, "(?<=AND\\s)\\w[^-]*"), County)) %>%
  mutate(County = str_remove(County, " AND.*"))

usgs_yearbooks_coal %<>%
  mutate_at(vars(Production), ~ ./(multiple_counties + 1)) %>%
  dplyr::select(-group_no, -group_id)

.county_codes <- read_delim("icpsrcnt_pre1930.txt",
                            delim = "|") %>%
  mutate(SCICP = as.factor(paste0(STATEICP, "-", COUNTYICP))) %>%
  dplyr::select(State, County, SCICP) %>%
  mutate(County = tolower(County))

usgs_yearbooks_coal %<>% 
  mutate(County = ifelse(State == "Illinois" & County == "Marlon", "Marion", County),
         County = ifelse(State == "Illinois" & County == "Lasalle", "La Salle", County),
         County = ifelse(State == "Indiana" & (County == "Vanderburg" | County == "Vanderberg"), "Vanderburgh", County),
         County = ifelse(State == "Indiana" & County == "Vermilion", "Vermillion", County),
         County = ifelse(State == "Missouri" & County == "Charitan", "Chariton", County),
         County = ifelse(State == "Montana" & County == "Choteau", "Chouteau", County),
         County = ifelse(State == "Pennsylvania" & County == "Center", "Centre", County),
         County = ifelse(State == "Utah" & County == "Uinta", "Uintah", County),
         County = ifelse(State == "Alabama" & County == "Winston", "Winston/Hancock", County),
         County = ifelse(State == "Kansas" & County == "Cherokee", "Cherokee/Mcghee", County),
         County = ifelse(State == "Kansas" & County == "Neosho", "Neosho/Dorn", County),
         County = ifelse(State == "Missouri" & County == "Cass", "Cass/Van Buren", County),
         County = ifelse(State == "Missouri" & County == "Henry", "Henry/Rives", County),
         County = ifelse(State == "South Dakota" & County == "Perkins", "Perkins/Spink", County),
         County = ifelse(State == "North Dakota" & County == "Williamson", "Williams", County),
         County = ifelse(State == "Utah" & County == "Grant", "Grand", County),
         County = ifelse(State == "Utah" & County == "San Pete", "Sanpete", County),
         County = ifelse(State == "Iowa" & County == "Green", "Greene", County),
         County = ifelse(State == "Ohio" & County == "Galia", "Gallia", County),
         County = ifelse(State == "Maryland" & County == "Allegheny", "Allegany", County)) %>%
  mutate(County = tolower(County)) %>% 
  left_join(.county_codes, by = c("State", "County"))
usgs_yearbooks_coal %>% filter(is.na(SCICP)) %>% distinct(State, County, .keep_all = T) # checking county non-matches

usgs_yearbooks_coal %<>%
  mutate(decade_agg_year = case_when(Year %in% 1890:1899 ~ 1900,
                                     Year %in% 1900:1909 ~ 1910,
                                     Year %in% 1910:1919 ~ 1920,
                                     Year %in% 1920:1929 ~ 1930,
                                     Year %in% 1930:1939 ~ 1940,
                                     TRUE ~ NA_real_),
         pres_agg_year = round.choose(x = Year, roundTo = 4, dir = 1),
         cong_agg_year = round.choose(x = Year, roundTo = 2, dir = 1),
         decade_agg_year2 = case_when(Year %in% 1891:1900 ~ 1900,
                                      Year %in% 1901:1910 ~ 1910,
                                      Year %in% 1911:1920 ~ 1920,
                                      Year %in% 1921:1930 ~ 1930,
                                      Year %in% 1931:1940 ~ 1940,
                                      TRUE ~ NA_real_),
         pres_agg_year2 = round.choose(x = Year-1, roundTo = 4, dir = 1),
         cong_agg_year2 = round.choose(x = Year-1, roundTo = 2, dir = 1))

usgs_yearbooks_coal_agg <- usgs_yearbooks_coal %>%
  group_by(SCICP, Mineral) %>%
  arrange(SCICP, Mineral, Year) %>%
  mutate(Production_chng = (Production - lag(Production))/lag(Production))

coal_agg <- usgs_yearbooks_coal_agg %>%
  group_by(SCICP, decade_agg_year, Mineral) %>%
  mutate(decadeProduction_chng_neg_count = sum(Production_chng < 0, na.rm = T),
         decadeProduction_chng_neg_mean = abs(mean(Production_chng[Production_chng < 0], na.rm = T))) %>%
  mutate(decadeProduction_chng_neg_weighted = decadeProduction_chng_neg_count*decadeProduction_chng_neg_mean) %>%
  mutate(decadeProduction_chng_neg_weighted = ifelse(mean(Production_chng >= 0, # setting production that only grew to 0 (no decline)
                                                          na.rm = T) == 1, 0, decadeProduction_chng_neg_weighted)) %>% 
  ungroup() %>%
  mutate_at(vars(starts_with("decadeProduction_")), ~ replace(., . == "NaN", NA)) %>%
  mutate_at(vars(starts_with("decadeProduction_")), ~ replace(., . == Inf, NA)) %>%
  mutate(decade_year = as.logical(Year %in% seq(1900, 1930, 10))) %>%
  dplyr::select(-decadeProduction_chng_neg_count,
                -decadeProduction_chng_neg_mean)

#write_rds(coal_agg, "coal_agg.rds")

# >> Cleaning ####

coal_agg <- read_rds("coal_agg.rds") %>% # coal production data
  filter(multiple_counties == 0) %>% # excluding data points that cover more than one county
  group_by(SCICP, decade_agg_year) %>%
  mutate(Production_chng = ifelse(is.infinite(Production_chng), NA, Production_chng)) %>%
  summarize(base_decline_magnitude = mean(decadeProduction_chng_neg_weighted, na.rm = TRUE), # shock intensity measure
            base_growth_magnitude = sum(Production_chng[Production_chng > 0], na.rm = TRUE),
            mean_prod_chng = mean(Production_chng, na.rm = TRUE),
            sum_prod_chng = sum(abs(Production_chng), na.rm = TRUE)) %>%
  ungroup() %>%
  rename(RANGE = decade_agg_year) %>%
  mutate(RANGE = case_when(RANGE == 1910 ~ 10,
                           RANGE == 1920 ~ 1020,
                           RANGE == 1930 ~ 2030)) %>%
  filter(!is.na(RANGE))
setDT(coal_agg)

mlp_full <- merge.data.table(x = mlp_full,
                             y = coal_agg,
                             by.x = c("SCICP_START", "RANGE"),
                             by.y = c("SCICP", "RANGE"),
                             all.x = TRUE)

base_decline_magnitude_mean <- mean(coal_agg$base_decline_magnitude, na.rm = TRUE)
base_decline_magnitude_sd <- sd(coal_agg$base_decline_magnitude, na.rm = TRUE)

base_growth_magnitude_mean <- mean(coal_agg$base_growth_magnitude, na.rm = TRUE)
base_growth_magnitude_sd <- sd(coal_agg$base_growth_magnitude, na.rm = TRUE)

mlp_full <- mlp_full[, base_decline_magnitude_std := (base_decline_magnitude - base_decline_magnitude_mean)/base_decline_magnitude_sd]
mlp_full <- mlp_full[, base_decline_magnitude_sqrt := sqrt(base_decline_magnitude)]

mlp_full <- mlp_full[, base_growth_magnitude_std := (base_growth_magnitude - base_growth_magnitude_mean)/base_growth_magnitude_sd]
mlp_full <- mlp_full[, base_growth_magnitude_sqrt := sqrt(base_growth_magnitude)]

rm(coal_agg)

# > Dropping big strike years from decline magnitude ####

strike_years_usa <- 1919
strike_years_pa <- 1902
strike_years_westmoreland_pa <- 1910:1911
strike_years_kanawha_wv <- 1912:1913
strike_years_las_animas_co <- 1914

coal_agg_strikes <- read_rds("coal_agg.rds") %>%
  filter(multiple_counties == 0) %>%
  group_by(SCICP, decade_agg_year) %>%
  mutate(Production_chng = ifelse(is.infinite(Production_chng), NA, Production_chng)) %>%
  mutate(Production_chng = case_when(Year == 1919 ~ NA_real_,
                                     Year == 1902 & State == "Pennsylvania" ~ NA_real_,
                                     Year %in% 1910:1911 & State == "Pennsylvania" & County == "westmoreland" ~ NA_real_,
                                     Year %in% 1912:1913 & State == "West Virginia" & County == "kanawha" ~ NA_real_,
                                     Year == 1914 & State == "Colorado" & County == "las animas" ~ NA_real_,
                                     TRUE ~ Production_chng)) %>%
  summarize(base_decline_magnitude_noStrikes = sum(Production_chng[Production_chng < 0], na.rm = TRUE)) %>%
  ungroup() %>%
  rename(RANGE = decade_agg_year) %>%
  mutate(RANGE = case_when(RANGE == 1910 ~ 10,
                           RANGE == 1920 ~ 1020,
                           RANGE == 1930 ~ 2030)) %>%
  filter(!is.na(RANGE))
setDT(coal_agg_strikes)

mlp_full <- merge.data.table(x = mlp_full,
                             y = coal_agg_strikes,
                             by.x = c("SCICP_START", "RANGE"),
                             by.y = c("SCICP", "RANGE"),
                             all.x = TRUE)

mlp_full <- mlp_full[, base_decline_magnitude_noStrikes := -base_decline_magnitude_noStrikes]
mlp_full <- mlp_full[, base_decline_magnitude_noStrikes_sqrt := sqrt(base_decline_magnitude_noStrikes)]

# > Leading measure of coal production ####

coal_agg_lead <- read_rds("coal_agg.rds") %>%
  filter(multiple_counties == 0) %>%
  group_by(SCICP, decade_agg_year) %>%
  summarize(base_decline_magnitude = mean(decadeProduction_chng_neg_weighted, na.rm = TRUE)) %>%
  ungroup() %>%
  rename(RANGE = decade_agg_year) %>%
  mutate(RANGE = case_when(RANGE == 1910 ~ NA_real_,
                           RANGE == 1920 ~ 10,
                           RANGE == 1930 ~ 1020)) %>%
  filter(!is.na(RANGE))
setDT(coal_agg_lead)

mlp_full <- merge.data.table(x = mlp_full,
                             y = coal_agg_lead,
                             by.x = c("SCICP_START", "RANGE"),
                             by.y = c("SCICP", "RANGE"),
                             suffixes = c("","_LEAD"),
                             all.x = TRUE)

rm(coal_agg_lead)

mlp_full <- mlp_full[, base_decline_magnitude_LEAD_sqrt := sqrt(base_decline_magnitude_LEAD)]

# > Election data ####
# - NOTE: This section relies on data from ICPSR, which can be accessed here: https://doi.org/10.3886/ICPSR08611.v1 (download as Stata file)
# -- Electoral Data for Counties in the United States: Presidential and Congressional Races, 1840-1972 (ICPSR 8611)

icpsr8611 <- read_dta("ICPSR8611_FILE_HERE.dta") %>%
  mutate(SCICP = paste0(V1, "-", V3)) %>%
  dplyr::select(SCICP, everything()) %>%
  mutate_at(vars(V4:V756), ~ replace(., (. > 999 & . < 1000) | . == 9999999, NA))

icpsr8611_labels_holder <- lapply(icpsr8611, attr, "label")
icpsr8611_labels <- c()
for (i in 1:759){
  icpsr8611_labels[[i]] <- icpsr8611_labels_holder[[i+1]]
}
colnames(icpsr8611)[2:ncol(icpsr8611)] <- icpsr8611_labels
icpsr8611 <- clean_names(icpsr8611)

icpsr8611_l <- icpsr8611 %>%
  dplyr::select(scicp, x1840_pres_dem_percent:cong_dist_number_1972) %>%
  pivot_longer(cols = starts_with(c("x", "cong")),
               names_to = "election",
               values_to = "value") %>%
  mutate(year = case_when(substr(election, 1, 1) == "x" ~ substr(election, 2, 5),
                          substr(election, 1, 4) == "cong" ~ substr(election, 18, 21),
                          TRUE ~ NA_character_)) %>%
  mutate(election = ifelse(substr(election, 1, 1) == "x", substr(election, 7, nchar(election)), election)) %>%
  mutate(election = ifelse(substr(election, 1, 16) == "cong_dist_number", "cong_dist_number", election)) %>%
  pivot_wider(names_from = election,
              values_from = value) %>%
  dplyr::select(scicp, year,
                pres_dem_percent,
                pres_rep_percent,
                pres_ttl_vote,
                pres_t_o_percent,
                cong_dist_number,
                cong_dem_percent,
                cong_rep_percent,
                cong_ttl_vote,
                cong_t_o_percent,
                everything()) %>%
  mutate(pres_dem_percent = ifelse(is.na(pres_dem_percent), pres_dem, pres_dem_percent)) %>%
  filter(year %in% 1900:1930) %>%
  select_if(~ !all(is.na(.))) %>%
  mutate(cong_dem_count = cong_dem_percent * cong_ttl_vote,
         cong_rep_count = cong_rep_percent * cong_ttl_vote,
         cong_progressive_count = cong_progressive_percent * cong_ttl_vote) %>%
  mutate(cong_progressive_count = ifelse(is.na(cong_progressive_count), 0, cong_progressive_count)) %>%
  mutate(stateicp = str_extract(scicp, "^.*?(?=-)")) %>%
  group_by(stateicp, cong_dist_number, year) %>%
  mutate(cong_dem_count_total = sum(cong_dem_count),
         cong_rep_count_total = sum(cong_rep_count),
         cong_progressive_count_total = sum(cong_progressive_count)) %>%
  mutate(cong_winner = case_when(cong_dem_count_total > cong_rep_count_total & cong_dem_count_total > cong_progressive_count_total ~ "DEM",
                                 cong_rep_count_total > cong_dem_count_total & cong_rep_count_total > cong_progressive_count_total ~ "REP",
                                 cong_progressive_count_total > cong_dem_count_total & cong_progressive_count_total > cong_rep_count_total ~ "PRO",
                                 TRUE ~ NA_character_)) %>%
  ungroup() %>%
  group_by(stateicp, cong_dist_number) %>%
  mutate(cong_inc = dplyr::lag(cong_winner, n = 1)) %>%
  mutate(cong_inc_lost = as.logical(cong_inc != cong_winner)) %>%
  mutate(cong_inc_percent = case_when(cong_inc == "DEM" ~ cong_dem_percent,
                                      cong_inc == "REP" ~ cong_rep_percent,
                                      cong_inc == "PRO" ~ cong_progressive_percent,
                                      TRUE ~ NA_real_)) %>%
  mutate_at(vars(ends_with("_percent")), ~ ifelse(. == 0, NA, .)) %>%
  mutate(cong_inc_2party_percent = case_when(cong_inc == "DEM" ~ cong_dem_percent/(cong_dem_percent+cong_rep_percent),
                                             cong_inc == "REP" ~ cong_rep_percent/(cong_dem_percent+cong_rep_percent),
                                             TRUE ~ NA_real_)) %>%
  mutate(cong_parties_contesting = sum(cong_dem_percent > 0 | cong_rep_percent > 0 | cong_progressive_percent > 0))

elections <- icpsr8611_l %>%
  dplyr::select(scicp:cong_parties_contesting) %>%
  filter(year %in% c(1900, 1910, 1920)) %>%
  rename(RANGE = year) %>%
  mutate(RANGE = case_when(RANGE == 1900 ~ 10,
                           RANGE == 1910 ~ 1020,
                           RANGE == 1920 ~ 2030)) %>%
  as.data.table()

mlp_full <- merge.data.table(x = mlp_full,
                             y = elections,
                             by.x = c("SCICP_START", "RANGE"),
                             by.y = c("scicp", "RANGE"),
                             all.x = TRUE)

mlp_full <- merge.data.table(x = mlp_full,
                             y = elections,
                             by.x = c("SCICP_END", "RANGE"),
                             by.y = c("scicp", "RANGE"),
                             all.x = TRUE,
                             suffixes = c("_START", "_END"))

# > Railroad data ####

rr1900_counties_with_rail <- fread("counties1900_withRail.csv") # rail lines in county
rr1900_counties_with_rail_to_other_coal <- fread("counties1900_railToOtherCoalCounty.csv") # rail connections between coal producing counties
rr1900_connectedDecline <- fread("counties1900_connectedDecline.csv") #  decline in rail-connected counties
rr1900_connectedWeightedDeclineByGroup <- fread("counties1900_connectedWeightedDeclineByGroup.csv") #  decline in rail-connected counties, weighted by coethnic group size in county (overall and by concentration in coal/PCTGROUP)
rr1900_unconnectedWeightedDeclineByGroup <- fread("counties1900_unconnectedWeightedDeclineByGroup.csv") #  decline in non-rail-connected counties, weighted by coethnic group size in county (overall and by concentration in coal/PCTGROUP)
rr1900_counties_with_rail <- rr1900_counties_with_rail[, RANGE := 10]
rr1900_counties_with_rail_to_other_coal <- rr1900_counties_with_rail_to_other_coal[, RANGE := 10]
rr1900_connectedDecline <- rr1900_connectedDecline[, RANGE := 10]
rr1900_connectedWeightedDeclineByGroup <- rr1900_connectedWeightedDeclineByGroup[, RANGE := 10]
rr1900_unconnectedWeightedDeclineByGroup <- rr1900_unconnectedWeightedDeclineByGroup[, RANGE := 10]
colnames(rr1900_connectedWeightedDeclineByGroup)[2] <- "GROUP"
colnames(rr1900_unconnectedWeightedDeclineByGroup)[2] <- "GROUP"
colnames(rr1900_unconnectedWeightedDeclineByGroup)[5:6] <- c("unconnected_GROUP_COAL_START",
                                                             "unconnected_GROUP_WORKERS_START")

rr1910_counties_with_rail <- fread("counties1910_withRail.csv")
rr1910_counties_with_rail_to_other_coal <- fread("counties1910_railToOtherCoalCounty.csv")
rr1910_connectedDecline <- fread("counties1910_connectedDecline.csv")
rr1910_connectedWeightedDeclineByGroup <- fread("counties1910_connectedWeightedDeclineByGroup.csv")
rr1910_unconnectedWeightedDeclineByGroup <- fread("counties1910_unconnectedWeightedDeclineByGroup.csv")
rr1910_counties_with_rail <- rr1910_counties_with_rail[, RANGE := 1020]
rr1910_counties_with_rail_to_other_coal <- rr1910_counties_with_rail_to_other_coal[, RANGE := 1020]
rr1910_connectedDecline <- rr1910_connectedDecline[, RANGE := 1020]
rr1910_connectedWeightedDeclineByGroup <- rr1910_connectedWeightedDeclineByGroup[, RANGE := 1020]
rr1910_unconnectedWeightedDeclineByGroup <- rr1910_unconnectedWeightedDeclineByGroup[, RANGE := 1020]
colnames(rr1910_connectedWeightedDeclineByGroup)[2] <- "GROUP"
colnames(rr1910_unconnectedWeightedDeclineByGroup)[2] <- "GROUP"
colnames(rr1910_unconnectedWeightedDeclineByGroup)[5:6] <- c("unconnected_GROUP_COAL_START",
                                                             "unconnected_GROUP_WORKERS_START")

rr_counties_with_rail <- rbindlist(l = list(rr1900_counties_with_rail,
                                            rr1910_counties_with_rail))
rr_counties_with_rail_to_other_coal <- rbindlist(l = list(rr1900_counties_with_rail_to_other_coal,
                                                          rr1910_counties_with_rail_to_other_coal))
colnames(rr_counties_with_rail_to_other_coal)[2] <- "RAIL_COAL"
rr_connectedDecline <- rbindlist(l = list(rr1900_connectedDecline,
                                          rr1910_connectedDecline))
rr_connectedWeightedDeclineByGroup <- rbindlist(l = list(rr1900_connectedWeightedDeclineByGroup,
                                                         rr1910_connectedWeightedDeclineByGroup))
rr_unconnectedWeightedDeclineByGroup <- rbindlist(l = list(rr1900_unconnectedWeightedDeclineByGroup,
                                                           rr1910_unconnectedWeightedDeclineByGroup))

mlp_full <- merge.data.table(x = mlp_full,
                             y = rr_counties_with_rail,
                             by.x = c("SCICP_START", "RANGE"),
                             by.y = c("SCICP", "RANGE"),
                             all.x = TRUE)
mlp_full <- merge.data.table(x = mlp_full,
                             y = rr_counties_with_rail_to_other_coal,
                             by.x = c("SCICP_START", "RANGE"),
                             by.y = c("SCICP", "RANGE"),
                             all.x = TRUE)
mlp_full <- merge.data.table(x = mlp_full,
                             y = rr_connectedDecline,
                             by.x = c("SCICP_START", "RANGE"),
                             by.y = c("SCICP", "RANGE"),
                             all.x = TRUE)
mlp_full <- merge.data.table(x = mlp_full,
                             y = rr_connectedWeightedDeclineByGroup,
                             by.x = c("SCICP_START", "RANGE", "GROUP_START"),
                             by.y = c("SCICP", "RANGE", "GROUP"),
                             all.x = TRUE)
mlp_full <- merge.data.table(x = mlp_full,
                             y = rr_unconnectedWeightedDeclineByGroup,
                             by.x = c("SCICP_START", "RANGE", "GROUP_START"),
                             by.y = c("SCICP", "RANGE", "GROUP"),
                             all.x = TRUE,
                             suffixes = c("_connected", "_unconnected"))

mlp_full <- mlp_full[, connected_GROUP_COAL_PCTGROUP_START_SQRT := sqrt(connected_GROUP_COAL_START/connected_GROUP_WORKERS_START)]
mlp_full <- mlp_full[, unconnected_GROUP_COAL_PCTGROUP_START_SQRT := sqrt(unconnected_GROUP_COAL_START/unconnected_GROUP_WORKERS_START)]

mlp_full <- mlp_full[, base_decline_magnitude_yes_connection_sqrt := sqrt(base_decline_magnitude_yes_connection)]
mlp_full <- mlp_full[, base_decline_magnitude_no_connection_sqrt := sqrt(base_decline_magnitude_no_connection)]

mlp_full <- mlp_full[, BARTIK_MEAN_yes_connection_sqrt := sqrt(abs(BARTIK_MEAN_yes_connection))]
mlp_full <- mlp_full[, BARTIK_MEAN_no_connection_sqrt := sqrt(abs(BARTIK_MEAN_no_connection))]

# > States with non-citizen suffrage (Kleppner 1987) ####

noncitizen_suffrage <- tibble(RANGE = c(10, 1020,
                                        10, 1020,
                                        10, 1020,
                                        10, 1020,
                                        10,
                                        10, 1020,
                                        10),
                              STATEICP = c(22, 22,
                                           32, 32,
                                           34, 34,
                                           35, 35,
                                           72,
                                           37, 37,
                                           25),
                              noncitizen_suffrage = 1)
noncitizen_suffrage <- as.data.table(noncitizen_suffrage)

mlp_full <- merge.data.table(x = mlp_full,
                             y = noncitizen_suffrage,
                             by.x = c("RANGE", "STATEICP_START"),
                             by.y = c("RANGE", "STATEICP"),
                             all.x = TRUE)

mlp_full <- mlp_full[, noncitizen_suffrage := ifelse(is.na(noncitizen_suffrage),
                                                     0,
                                                     1)]

# > Creating derivative variables ####

# >> Transforming certain variables ####

mlp_full <- mlp_full[, MALE := ifelse(SEX_START == 1, 1, 0)]
mlp_full <- mlp_full[, YRIMMIG_START := ifelse(YRIMMIG_START == 0, NA, YRIMMIG_START)]
mlp_full <- mlp_full[, LIT_START := ifelse(LIT_START == 4, 1, 0)]

# >> Generating new RHS variables ####

mlp_full <- mlp_full[, GROUP_PCTCOUNTY_START := GROUP_COUNTY_START/TOTAL_COUNTY_START]
mlp_full <- mlp_full[, GROUP_COAL_PCTGROUP_START := GROUP_COAL_START/GROUP_WORKERS_START]
mlp_full <- mlp_full[, GROUP_COAL_PCTIND_START := GROUP_COAL_START/TOTAL_COUNTY_COAL_START]
mlp_full <- mlp_full[, COAL_RELIANCE_START := TOTAL_COUNTY_COAL_START/TOTAL_COUNTY_WORKERS_START]

mlp_full <- mlp_full[, GROUP_PCTCOUNTY_START_SQRT := sqrt(GROUP_PCTCOUNTY_START)]
mlp_full <- mlp_full[, GROUP_COAL_PCTGROUP_START_SQRT := sqrt(GROUP_COAL_PCTGROUP_START)]
mlp_full <- mlp_full[, GROUP_COAL_PCTIND_START_SQRT := sqrt(GROUP_COAL_PCTIND_START)]
mlp_full <- mlp_full[, COAL_RELIANCE_START_SQRT := sqrt(COAL_RELIANCE_START)]

mlp_full <- mlp_full[, INCWAGE_MEDIAN_START_LN := log1p(INCWAGE_MEDIAN_START)]

mlp_full <- mlp_full[, BLACK_PCTCOUNTY_START := TOTAL_NBLACK_COUNTY_START/TOTAL_COUNTY_START]
mlp_full <- mlp_full[, BLACK_PCTCOUNTY_START_SQRT := sqrt(BLACK_PCTCOUNTY_START)]

mlp_full <- mlp_full[, RURAL_PCTCOUNTY_START := RURAL_COUNTY_START/TOTAL_COUNTY_START]

# >> Generating outcome variables ####

mlp_full <- mlp_full[, NATURALIZING_END := ifelse(CITIZEN_END == 2 | CITIZEN_END == 4, 1, 0)] # declarations of intention
mlp_full <- mlp_full[, NATURALIZED_END := ifelse(CITIZEN_END == 2, 1, 0)] # successful naturalization

# >> Combined variables ####

mlp_full <- mlp_full[, STATEICP_RANGE := paste0(STATEICP_START, "-", RANGE)]
mlp_full <- mlp_full[, GROUP_SCICP := paste0(GROUP_START, "-", SCICP_START)]
mlp_full <- mlp_full[, RANGE_SCICP := paste0(RANGE, "-", SCICP_START)]

# >> Logarithmic ####

mlp_full <- mlp_full[, base_decline_magnitude_ln := log1p(base_decline_magnitude)]
mlp_full <- mlp_full[, GROUP_COAL_PCTGROUP_START_LN := log1p(GROUP_COAL_PCTGROUP_START)]
mlp_full <- mlp_full[, BARTIK_MEAN_LN := log1p(BARTIK_MEAN)]
mlp_full <- mlp_full[, GROUP_PCTCOUNTY_START_LN := log1p(GROUP_PCTCOUNTY_START)]
mlp_full <- mlp_full[, COAL_RELIANCE_START_LN := log1p(COAL_RELIANCE_START)]
mlp_full <- mlp_full[, BLACK_PCTCOUNTY_START_LN := log1p(BLACK_PCTCOUNTY_START)]

# *********************************** #
##################################### #
#### *** FAMILIAL TIES TO COAL *** ####
##################################### #
# *********************************** #

c1900 <- fread("1900_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "character",
                          STATEICP = "integer",
                          COUNTYICP = "integer",
                          SERIAL = "integer",
                          FAMUNIT = "integer",
                          AGE = "integer",
                          RACE = "integer",
                          BPL = "integer",
                          IND1950 = "integer"))
c1910 <- fread("1910_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "character",
                          STATEICP = "integer",
                          COUNTYICP = "integer",
                          SERIAL = "integer",
                          FAMUNIT = "integer",
                          AGE = "integer",
                          RACE = "integer",
                          BPL = "integer",
                          IND1950 = "integer"))

# > Constructing household family variable ####

c1900 <- c1900[, SERIAL_FAMUNIT := paste0(SERIAL, "-", FAMUNIT)]
c1910 <- c1910[, SERIAL_FAMUNIT := paste0(SERIAL, "-", FAMUNIT)]
c1920 <- c1920[, SERIAL_FAMUNIT := paste0(SERIAL, "-", FAMUNIT)]
c1930 <- c1930[, SERIAL_FAMUNIT := paste0(SERIAL, "-", FAMUNIT)]

# > Coal households ####

c1900_coal_hh <- unique(c1900[IND1950 == 216]$SERIAL_FAMUNIT)
c1910_coal_hh <- unique(c1910[IND1950 == 216]$SERIAL_FAMUNIT)

# > Household connections to coal ####

c1900_coal_hh_connections <- unique(c1900[SERIAL_FAMUNIT %in% c1900_coal_hh & IND1950 != 216]$HISTID)
c1910_coal_hh_connections <- unique(c1910[SERIAL_FAMUNIT %in% c1910_coal_hh & IND1950 != 216]$HISTID)

#save(c1900_coal_hh_connections,
#     c1910_coal_hh_connections,
#     file = "coal_hh_connections.RData")

#load(file = "coal_hh_connections.RData")

hh_connection <- data.table(HISTID_START = c(c1900_coal_hh_connections,
                                             c1910_coal_hh_connections),
                            RANGE = c(rep(10, length(c1900_coal_hh_connections)),
                                      rep(1020, length(c1910_coal_hh_connections))))

rm(list = c("c1900_coal_hh_connections", "c1910_coal_hh_connections"))

hh_connection <- hh_connection[, COAL_FAM_START := TRUE]

mlp_full <- merge.data.table(x = mlp_full,
                             y = hh_connection,
                             by = c("HISTID_START", "RANGE"),
                             all.x = TRUE)

mlp_full <- mlp_full[, COAL_FAM_START := ifelse(is.na(COAL_FAM_START), FALSE, TRUE)]

rm(hh_connection)

# ********************************************** #
################################################ #
#### *** MARRIAGE/LIVING WITH SPOUSE DATA *** ####
################################################ #
# ********************************************** #

# > 1900 census ####

c1900 <- fread("1900_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "factor",
                          SERIAL = "integer",
                          RELATE = "integer",
                          FAMUNIT = "integer",
                          STATEICP = "integer",
                          COUNTYICP = "integer",
                          RACE = "integer",
                          HISPAN = "integer",
                          MTONGUE = "integer",
                          MTONGUED = "integer",
                          BPL = "integer",
                          BPLD = "integer",
                          FBPL = "integer",
                          MBPL = "integer",
                          IND1950 = "integer"))

c1900 <- c1900[RELATE %in% 1:2]

c1900 <- c1900[, GROUP := ethnicity(bpl = BPL, bpld = BPLD, mbpl = MBPL, fbpl = FBPL, race = RACE, hispan = HISPAN, mtongue = NA, mtongued = NA)]

c1900 <- c1900[, `:=` (MARRIED = .N), by = list(SERIAL, FAMUNIT)]
c1900 <- c1900[MARRIED == 2]

c1900 <- c1900[, `:=` (INTERMARRIAGE = uniqueN(GROUP)), by = list(SERIAL, FAMUNIT)]
c1900 <- c1900[, INTERMARRIAGE := ifelse(INTERMARRIAGE == 2, 1, 0)]

c1900 <- c1900[, NATVE_WHITE := ifelse(BPL < 100 & MBPL < 100 & FBPL < 100 & RACE == 1, 1, 0) ]
c1900 <- c1900[, `:=` (INTERMARRIAGE_NWHITE = sum(NATVE_WHITE == 1, na.rm = TRUE)), by = list(SERIAL, FAMUNIT)]
c1900 <- c1900[, INTERMARRIAGE_NWHITE := ifelse(INTERMARRIAGE_NWHITE == 1, 1, 0)]

# > 1910 census ####

c1910 <- fread("1910_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "factor",
                          SERIAL = "integer",
                          RELATE = "integer",
                          FAMUNIT = "integer",
                          STATEICP = "integer",
                          COUNTYICP = "integer",
                          RACE = "integer",
                          HISPAN = "integer",
                          MTONGUE = "integer",
                          MTONGUED = "integer",
                          BPL = "integer",
                          BPLD = "integer",
                          FBPL = "integer",
                          MBPL = "integer",
                          IND1950 = "integer"))

c1910 <- c1910[RELATE %in% 1:2]

c1910 <- c1910[, GROUP := ethnicity(bpl = BPL, bpld = BPLD, mbpl = MBPL, fbpl = FBPL, race = RACE, hispan = HISPAN, mtongue = MTONGUE, mtongued = MTONGUED)]

c1910 <- c1910[, `:=` (MARRIED = .N), by = list(SERIAL, FAMUNIT)]
c1910 <- c1910[MARRIED == 2]

c1910 <- c1910[, `:=` (INTERMARRIAGE = uniqueN(GROUP)), by = list(SERIAL, FAMUNIT)]
c1910 <- c1910[, INTERMARRIAGE := ifelse(INTERMARRIAGE == 2, 1, 0)]

c1910 <- c1910[, NATVE_WHITE := ifelse(BPL < 100 & MBPL < 100 & FBPL < 100 & RACE == 1, 1, 0) ]
c1910 <- c1910[, `:=` (INTERMARRIAGE_NWHITE = sum(NATVE_WHITE == 1, na.rm = TRUE)), by = list(SERIAL, FAMUNIT)]
c1910 <- c1910[, INTERMARRIAGE_NWHITE := ifelse(INTERMARRIAGE_NWHITE == 1, 1, 0)]

# > 1920 census ####

c1920 <- fread("1920_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "factor",
                          SERIAL = "integer",
                          RELATE = "integer",
                          FAMUNIT = "integer",
                          STATEICP = "integer",
                          COUNTYICP = "integer",
                          RACE = "integer",
                          HISPAN = "integer",
                          MTONGUE = "integer",
                          MTONGUED = "integer",
                          BPL = "integer",
                          BPLD = "integer",
                          FBPL = "integer",
                          MBPL = "integer",
                          IND1950 = "integer"))

c1920 <- c1920[RELATE %in% 1:2]

c1920 <- c1920[, GROUP := ethnicity(bpl = BPL, bpld = BPLD, mbpl = MBPL, fbpl = FBPL, race = RACE, hispan = HISPAN, mtongue = MTONGUE, mtongued = MTONGUED)]

c1920 <- c1920[, `:=` (MARRIED = .N), by = list(SERIAL, FAMUNIT)]
c1920 <- c1920[MARRIED == 2]

c1920 <- c1920[, `:=` (INTERMARRIAGE = uniqueN(GROUP)), by = list(SERIAL, FAMUNIT)]
c1920 <- c1920[, INTERMARRIAGE := ifelse(INTERMARRIAGE == 2, 1, 0)]

c1920 <- c1920[, NATVE_WHITE := ifelse(BPL < 100 & MBPL < 100 & FBPL < 100 & RACE == 1, 1, 0) ]
c1920 <- c1920[, `:=` (INTERMARRIAGE_NWHITE = sum(NATVE_WHITE == 1, na.rm = TRUE)), by = list(SERIAL, FAMUNIT)]
c1920 <- c1920[, INTERMARRIAGE_NWHITE := ifelse(INTERMARRIAGE_NWHITE == 1, 1, 0)]

# > Binding ####

c1900 <- c1900[, c("HISTID", "SERIAL", "MARRIED", "INTERMARRIAGE", "INTERMARRIAGE_NWHITE"), with = FALSE]
c1910 <- c1910[, c("HISTID", "SERIAL", "MARRIED", "INTERMARRIAGE", "INTERMARRIAGE_NWHITE"), with = FALSE]
c1920 <- c1920[, c("HISTID", "SERIAL", "MARRIED", "INTERMARRIAGE", "INTERMARRIAGE_NWHITE"), with = FALSE]

c1900 <- c1900[, RANGE := 10]
c1910 <- c1910[, RANGE := 1020]
c1920 <- c1920[, RANGE := 2030]

c1900 <- c1900[, RANGE_END := NA]
c1910 <- c1910[, RANGE_END := 10]
c1920 <- c1920[, RANGE_END := 1020]

marriage <- rbindlist(l = list(c1900, c1910, c1920))

marriage <- marriage[, INTERMARRIAGE_IMMIG := ifelse(INTERMARRIAGE == 1 & INTERMARRIAGE_NWHITE == 0,
                                                     1, 0)]

marriage <- marriage[, SERIAL := NULL]
marriage <- marriage[, RANGE := NULL]
marriage <- marriage[, RANGE_END := NULL]

# > Exporting ####

#fwrite(marriage, "marriage.csv")

# > Loading and merging ####

#marriage <- fread("marriage.csv")

marriage <- marriage[HISTID %in% mlp_full$HISTID_START | HISTID %in% mlp_full$HISTID_END]

mlp_full <- merge.data.table(x = mlp_full,
                             y = marriage,
                             by.x = "HISTID_START",
                             by.y = "HISTID",
                             all.x = TRUE)
mlp_full <- merge.data.table(x = mlp_full,
                             y = marriage,
                             by.x = "HISTID_END",
                             by.y = "HISTID",
                             all.x = TRUE,
                             suffixes = c("_START", "_END"))

rm(marriage)

mlp_full <- mlp_full[, INTERMARRIAGE_START := ifelse(is.na(INTERMARRIAGE_START), 0, INTERMARRIAGE_START)]
mlp_full <- mlp_full[, INTERMARRIAGE_NWHITE_START := ifelse(is.na(INTERMARRIAGE_NWHITE_START), 0, INTERMARRIAGE_NWHITE_START)]
mlp_full <- mlp_full[, INTERMARRIAGE_IMMIG_START := ifelse(is.na(INTERMARRIAGE_IMMIG_START), 0, INTERMARRIAGE_IMMIG_START)]
mlp_full <- mlp_full[, INTERMARRIAGE_END := ifelse(is.na(INTERMARRIAGE_END), 0, INTERMARRIAGE_END)]
mlp_full <- mlp_full[, INTERMARRIAGE_NWHITE_END := ifelse(is.na(INTERMARRIAGE_NWHITE_END), 0, INTERMARRIAGE_NWHITE_END)]
mlp_full <- mlp_full[, INTERMARRIAGE_IMMIG_END := ifelse(is.na(INTERMARRIAGE_IMMIG_END), 0, INTERMARRIAGE_IMMIG_END)]
mlp_full <- mlp_full[, MARRIED_START := ifelse(is.na(MARRIED_START), 0, 1)]
mlp_full <- mlp_full[, MARRIED_END := ifelse(is.na(MARRIED_END), 0, 1)]

# **************************** #
############################## #
#### *** SPEAKS ENGLISH *** ####
############################## #
# **************************** #

# > Loading complete-count census records ####

c1900 <- fread("1900_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "factor",
                          SPEAKENG = "integer",
                          BPL = "integer"))
c1900 <- c1900[BPL %in% 400:499]
c1900 <- c1900[, BPL := NULL]
c1900 <- c1900[, SPEAKENG := fcase(SPEAKENG == 0 | SPEAKENG == 9, NA_real_,
                                   SPEAKENG == 1, 0,
                                   SPEAKENG == 2, 1)]
c1900 <- c1900[!is.na(SPEAKENG)]

c1910 <- fread("1910_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "factor",
                          SPEAKENG = "integer",
                          BPL = "integer"))
c1910 <- c1910[BPL %in% 400:499]
c1910 <- c1910[, BPL := NULL]
c1910 <- c1910[, SPEAKENG := fcase(SPEAKENG == 0 | SPEAKENG == 9, NA_real_,
                                   SPEAKENG == 1, 0,
                                   SPEAKENG == 2, 1)]
c1910 <- c1910[!is.na(SPEAKENG)]

c1920 <- fread("1920_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "factor",
                          SPEAKENG = "integer",
                          BPL = "integer"))
c1920 <- c1920[BPL %in% 400:499]
c1920 <- c1920[, BPL := NULL]
c1920 <- c1920[, SPEAKENG := fcase(SPEAKENG == 0 | SPEAKENG == 9, NA_real_,
                                   SPEAKENG == 1, 0,
                                   SPEAKENG == 2, 1)]
c1920 <- c1920[!is.na(SPEAKENG)]

speakeng <- rbindlist(l = list(c1900, c1910, c1920))

#fwrite(speakeng, "speakeng.csv")
#speakeng <- fread("speakeng.csv")

mlp_full <- merge.data.table(x = mlp_full,
                             y = speakeng,
                             by.x = "HISTID_START",
                             by.y = "HISTID",
                             all.x = TRUE)
mlp_full <- merge.data.table(x = mlp_full,
                             y = speakeng,
                             by.x = "HISTID_END",
                             by.y = "HISTID",
                             suffixes = c("_START", "_END"),
                             all.x = TRUE)

# ******************************** #
################################## #
#### *** COETHNIC COWORKERS *** ####
################################## #
# ******************************** #

# > Loading complete-count census records ####

c1900 <- fread("1900_FULL_COUNT_FILE_HERE",
               select = c(HISTID = "factor",
                          SERIAL = "integer",
                          RELATE = "integer",
                          STATEICP = "integer",
                          COUNTYICP = "integer",
                          URBAN = "integer",
                          FARM = "integer",
                          SEX = "integer",
                          AGE = "integer",
                          RACE = "integer",
                          HISPAN = "integer",
                          MTONGUE = "integer",
                          MTONGUED = "integer",
                          BPL = "integer",
                          BPLD = "integer",
                          FBPL = "integer",
                          MBPL = "integer",
                          CITIZEN = "integer",
                          YRIMMIG = "integer",
                          IND1950 = "integer"))
c1900 <- c1900[, SCICP := paste0(STATEICP, "-", COUNTYICP)]
c1900 <- c1900[, STATEICP := NULL]
c1900 <- c1900[, COUNTYICP := NULL]
c1900 <- c1900[, GROUP := ethnicity(bpl = BPL, bpld = BPLD, mbpl = MBPL, fbpl = FBPL, race = RACE, hispan = HISPAN, mtongue = NA, mtongued = NA)]

c1900 <- c1900[SCICP %in% usgs_yearbooks_coal_1900]

c1910 <- fread("1910_FULL_COUNT_FILE_HERE",
               select = ipums_col_set01)
c1910 <- c1910[, SCICP := paste0(STATEICP, "-", COUNTYICP)]
c1910 <- c1910[, STATEICP := NULL]
c1910 <- c1910[, COUNTYICP := NULL]
c1910 <- c1910[, GROUP := ethnicity(bpl = BPL, bpld = BPLD, mbpl = MBPL, fbpl = FBPL, race = RACE, hispan = HISPAN, mtongue = NA, mtongued = NA)]

c1910 <- c1910[SCICP %in% usgs_yearbooks_coal_1910]

# >> % coworkers who are coethnics (PCTIND) by group-industry-county ####

c1900 <- c1900[, c("SCICP", "HISTID", "IND1950", "GROUP"), with = FALSE]
c1910 <- c1910[, c("SCICP", "HISTID", "IND1950", "GROUP"), with = FALSE]

c1900 <- c1900[, RANGE := 10]
c1910 <- c1910[, RANGE := 1020]

pctind_all <- rbindlist(l = list(c1900, c1910))

pctind_all <- pctind_all[, `:=` (GROUP_IND = .N), by = list(SCICP, GROUP, IND1950, RANGE)]
pctind_all <- pctind_all[, `:=` (ALL_IND = .N), by = list(SCICP, IND1950, RANGE)]
pctind_all <- pctind_all[, IND_PCTIND := GROUP_IND/ALL_IND]

pctind_all <- pctind_all[, c("HISTID", "IND_PCTIND"), with = FALSE]

#fwrite(pctind_all, "pctind_by_histid.csv")
#pctind_all <- fread("pctind_by_histid.csv")

mlp_full <- merge.data.table(x = mlp_full,
                             y = pctind_all,
                             by.x = "HISTID_START",
                             by.y = "HISTID",
                             all.x = TRUE)

rm(pctind_all)

# *********************** #
######################### #
#### *** EXPORTING *** ####
######################### #
# *********************** #

fwrite(mlp_full, "mlp_full.csv") # saving file for use in analysis.R

